import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display
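The companion notebook `CMPINF2120_EPM_FUNC_INCL_Over_Lisa.ipynb` defines the helper functions used in this notebook; it is executed with `%run` so that those functions are available here.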
%run CMPINF2120_EPM_FUNC_INCL_Over_Lisa.ipynb
interim_sqrt_path = 'https://raw.githubusercontent.com/lisaover/CMPINF2120_project/main/tp_sqrt_inputs_interim_df.csv'
interim_sqrt_init = pd.read_csv(interim_sqrt_path)
interim_sqrt_init.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3642 entries, 0 to 3641 Data columns (total 83 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sess 3642 non-null int64 1 sid 3642 non-null int64 2 actv_grp 3642 non-null object 3 total_ms_tp000_sqrt 3642 non-null float64 4 mw_tp000_sqrt 3642 non-null float64 5 mwc_tp000_sqrt 3642 non-null float64 6 mcl_tp000_sqrt 3642 non-null float64 7 mcr_tp000_sqrt 3642 non-null float64 8 mm_tp000_sqrt 3642 non-null float64 9 ks_tp000_sqrt 3642 non-null float64 10 total_ms_tp010_sqrt 3642 non-null float64 11 mw_tp010_sqrt 3642 non-null float64 12 mwc_tp010_sqrt 3642 non-null float64 13 mcl_tp010_sqrt 3642 non-null float64 14 mcr_tp010_sqrt 3642 non-null float64 15 mm_tp010_sqrt 3642 non-null float64 16 ks_tp010_sqrt 3642 non-null float64 17 total_ms_tp020_sqrt 3642 non-null float64 18 mw_tp020_sqrt 3642 non-null float64 19 mwc_tp020_sqrt 3642 non-null float64 20 mcl_tp020_sqrt 3642 non-null float64 21 mcr_tp020_sqrt 3642 non-null float64 22 mm_tp020_sqrt 3642 non-null float64 23 ks_tp020_sqrt 3642 non-null float64 24 total_ms_tp030_sqrt 3642 non-null float64 25 mw_tp030_sqrt 3642 non-null float64 26 mwc_tp030_sqrt 3642 non-null float64 27 mcl_tp030_sqrt 3642 non-null float64 28 mcr_tp030_sqrt 3642 non-null float64 29 mm_tp030_sqrt 3642 non-null float64 30 ks_tp030_sqrt 3642 non-null float64 31 total_ms_tp040_sqrt 3642 non-null float64 32 mw_tp040_sqrt 3642 non-null float64 33 mwc_tp040_sqrt 3642 non-null float64 34 mcl_tp040_sqrt 3642 non-null float64 35 mcr_tp040_sqrt 3642 non-null float64 36 mm_tp040_sqrt 3642 non-null float64 37 ks_tp040_sqrt 3642 non-null float64 38 total_ms_tp050_sqrt 3642 non-null float64 39 mw_tp050_sqrt 3642 non-null float64 40 mwc_tp050_sqrt 3642 non-null float64 41 mcl_tp050_sqrt 3642 non-null float64 42 mcr_tp050_sqrt 3642 non-null float64 43 mm_tp050_sqrt 3642 non-null float64 44 ks_tp050_sqrt 3642 non-null float64 45 total_ms_tp060_sqrt 3642 non-null float64 46 
mw_tp060_sqrt 3642 non-null float64 47 mwc_tp060_sqrt 3642 non-null float64 48 mcl_tp060_sqrt 3642 non-null float64 49 mcr_tp060_sqrt 3642 non-null float64 50 mm_tp060_sqrt 3642 non-null float64 51 ks_tp060_sqrt 3642 non-null float64 52 total_ms_tp070_sqrt 3642 non-null float64 53 mw_tp070_sqrt 3642 non-null float64 54 mwc_tp070_sqrt 3642 non-null float64 55 mcl_tp070_sqrt 3642 non-null float64 56 mcr_tp070_sqrt 3642 non-null float64 57 mm_tp070_sqrt 3642 non-null float64 58 ks_tp070_sqrt 3642 non-null float64 59 total_ms_tp080_sqrt 3642 non-null float64 60 mw_tp080_sqrt 3642 non-null float64 61 mwc_tp080_sqrt 3642 non-null float64 62 mcl_tp080_sqrt 3642 non-null float64 63 mcr_tp080_sqrt 3642 non-null float64 64 mm_tp080_sqrt 3642 non-null float64 65 ks_tp080_sqrt 3642 non-null float64 66 total_ms_tp090_sqrt 3642 non-null float64 67 mw_tp090_sqrt 3642 non-null float64 68 mwc_tp090_sqrt 3642 non-null float64 69 mcl_tp090_sqrt 3642 non-null float64 70 mcr_tp090_sqrt 3642 non-null float64 71 mm_tp090_sqrt 3642 non-null float64 72 ks_tp090_sqrt 3642 non-null float64 73 total_ms_tp100_sqrt 3642 non-null float64 74 mw_tp100_sqrt 3642 non-null float64 75 mwc_tp100_sqrt 3642 non-null float64 76 mcl_tp100_sqrt 3642 non-null float64 77 mcr_tp100_sqrt 3642 non-null float64 78 mm_tp100_sqrt 3642 non-null float64 79 ks_tp100_sqrt 3642 non-null float64 80 interim_scr 3642 non-null float64 81 max_interim_scr 3642 non-null float64 82 interim_pass 3642 non-null float64 dtypes: float64(80), int64(2), object(1) memory usage: 2.3+ MB
# Count missing values per column (summing the boolean mask down the rows).
interim_sqrt_init.isna().sum(axis=0)
sess 0
sid 0
actv_grp 0
total_ms_tp000_sqrt 0
mw_tp000_sqrt 0
..
mm_tp100_sqrt 0
ks_tp100_sqrt 0
interim_scr 0
max_interim_scr 0
interim_pass 0
Length: 83, dtype: int64
# Store the two identifier columns as object dtype.
# NOTE(review): presumably so pandas/seaborn treat them as labels rather than
# numeric quantities -- confirm intent.
interim_sqrt_init['sid'] = interim_sqrt_init['sid'].astype('object')
interim_sqrt_init['sess'] = interim_sqrt_init['sess'].astype('object')

# Work on a copy so the loaded frame itself is not modified further.
interim_sqrt_df = interim_sqrt_init.copy()

# Column-name lists produced by helpers that are not defined in this notebook
# (they are expected to come from the notebook executed with %run).
# NOTE(review): the difference between get_var_list and get_var_list_b is not
# visible here; the 'mw_' pattern presumably keeps 'mwc...' columns out of
# mw_vars -- verify against the helper definitions.
sqrt_vars = get_var_list(interim_sqrt_df, ['sqrt'])
totl_vars = get_var_list_b(interim_sqrt_df, ['total'])
mw_vars = get_var_list_b(interim_sqrt_df, ['mw_'])
mwc_vars = get_var_list_b(interim_sqrt_df, ['mwc'])
mcl_vars = get_var_list_b(interim_sqrt_df, ['mcl'])
mcr_vars = get_var_list_b(interim_sqrt_df, ['mcr'])
mm_vars = get_var_list_b(interim_sqrt_df, ['mm'])
ks_vars = get_var_list_b(interim_sqrt_df, ['ks'])

# Independent frame holding only the columns selected by sqrt_vars.
features_df = interim_sqrt_df.loc[:, sqrt_vars].copy()
features_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3642 entries, 0 to 3641 Data columns (total 77 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_ms_tp000_sqrt 3642 non-null float64 1 mw_tp000_sqrt 3642 non-null float64 2 mwc_tp000_sqrt 3642 non-null float64 3 mcl_tp000_sqrt 3642 non-null float64 4 mcr_tp000_sqrt 3642 non-null float64 5 mm_tp000_sqrt 3642 non-null float64 6 ks_tp000_sqrt 3642 non-null float64 7 total_ms_tp010_sqrt 3642 non-null float64 8 mw_tp010_sqrt 3642 non-null float64 9 mwc_tp010_sqrt 3642 non-null float64 10 mcl_tp010_sqrt 3642 non-null float64 11 mcr_tp010_sqrt 3642 non-null float64 12 mm_tp010_sqrt 3642 non-null float64 13 ks_tp010_sqrt 3642 non-null float64 14 total_ms_tp020_sqrt 3642 non-null float64 15 mw_tp020_sqrt 3642 non-null float64 16 mwc_tp020_sqrt 3642 non-null float64 17 mcl_tp020_sqrt 3642 non-null float64 18 mcr_tp020_sqrt 3642 non-null float64 19 mm_tp020_sqrt 3642 non-null float64 20 ks_tp020_sqrt 3642 non-null float64 21 total_ms_tp030_sqrt 3642 non-null float64 22 mw_tp030_sqrt 3642 non-null float64 23 mwc_tp030_sqrt 3642 non-null float64 24 mcl_tp030_sqrt 3642 non-null float64 25 mcr_tp030_sqrt 3642 non-null float64 26 mm_tp030_sqrt 3642 non-null float64 27 ks_tp030_sqrt 3642 non-null float64 28 total_ms_tp040_sqrt 3642 non-null float64 29 mw_tp040_sqrt 3642 non-null float64 30 mwc_tp040_sqrt 3642 non-null float64 31 mcl_tp040_sqrt 3642 non-null float64 32 mcr_tp040_sqrt 3642 non-null float64 33 mm_tp040_sqrt 3642 non-null float64 34 ks_tp040_sqrt 3642 non-null float64 35 total_ms_tp050_sqrt 3642 non-null float64 36 mw_tp050_sqrt 3642 non-null float64 37 mwc_tp050_sqrt 3642 non-null float64 38 mcl_tp050_sqrt 3642 non-null float64 39 mcr_tp050_sqrt 3642 non-null float64 40 mm_tp050_sqrt 3642 non-null float64 41 ks_tp050_sqrt 3642 non-null float64 42 total_ms_tp060_sqrt 3642 non-null float64 43 mw_tp060_sqrt 3642 non-null float64 44 mwc_tp060_sqrt 3642 non-null float64 45 mcl_tp060_sqrt 3642 
non-null float64 46 mcr_tp060_sqrt 3642 non-null float64 47 mm_tp060_sqrt 3642 non-null float64 48 ks_tp060_sqrt 3642 non-null float64 49 total_ms_tp070_sqrt 3642 non-null float64 50 mw_tp070_sqrt 3642 non-null float64 51 mwc_tp070_sqrt 3642 non-null float64 52 mcl_tp070_sqrt 3642 non-null float64 53 mcr_tp070_sqrt 3642 non-null float64 54 mm_tp070_sqrt 3642 non-null float64 55 ks_tp070_sqrt 3642 non-null float64 56 total_ms_tp080_sqrt 3642 non-null float64 57 mw_tp080_sqrt 3642 non-null float64 58 mwc_tp080_sqrt 3642 non-null float64 59 mcl_tp080_sqrt 3642 non-null float64 60 mcr_tp080_sqrt 3642 non-null float64 61 mm_tp080_sqrt 3642 non-null float64 62 ks_tp080_sqrt 3642 non-null float64 63 total_ms_tp090_sqrt 3642 non-null float64 64 mw_tp090_sqrt 3642 non-null float64 65 mwc_tp090_sqrt 3642 non-null float64 66 mcl_tp090_sqrt 3642 non-null float64 67 mcr_tp090_sqrt 3642 non-null float64 68 mm_tp090_sqrt 3642 non-null float64 69 ks_tp090_sqrt 3642 non-null float64 70 total_ms_tp100_sqrt 3642 non-null float64 71 mw_tp100_sqrt 3642 non-null float64 72 mwc_tp100_sqrt 3642 non-null float64 73 mcl_tp100_sqrt 3642 non-null float64 74 mcr_tp100_sqrt 3642 non-null float64 75 mm_tp100_sqrt 3642 non-null float64 76 ks_tp100_sqrt 3642 non-null float64 dtypes: float64(77) memory usage: 2.1 MB
# Keep the feature column labels for later reshaping of the data.
feature_names = features_df.columns
# Number of feature columns.
len(features_df.columns)
77
# Start again from a fresh copy of the loaded data.
interim_sqrt_df = interim_sqrt_init.copy()
# Session 2: distinct activity groups, then how many there are.
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 2, 'actv_grp'].unique()
array(['Aulaweb', 'Blank', 'Deeds', 'Diagram', 'Other', 'Properties',
'Study', 'TextEditor', 'FSM_Related', 'Study_Materials'],
dtype=object)
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 2, 'actv_grp'].nunique()
10
# Session 3.
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 3, 'actv_grp'].unique()
array(['Aulaweb', 'Blank', 'Deeds', 'Diagram', 'Other', 'Properties',
'Study', 'TextEditor', 'FSM_Related', 'Study_Materials'],
dtype=object)
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 3, 'actv_grp'].nunique()
10
# Session 4.
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 4, 'actv_grp'].unique()
array(['Aulaweb', 'Blank', 'Deeds', 'Diagram', 'FSM_Related', 'Other',
'Properties', 'Study', 'TextEditor', 'Study_Materials'],
dtype=object)
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 4, 'actv_grp'].nunique()
10
# Session 5.
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 5, 'actv_grp'].unique()
array(['Aulaweb', 'Blank', 'Deeds', 'Diagram', 'Other', 'Properties',
'Study', 'TextEditor', 'Study_Materials', 'FSM_Related'],
dtype=object)
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 5, 'actv_grp'].nunique()
10
# Session 6: the recorded output lists one extra group ('FSM').
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 6, 'actv_grp'].unique()
array(['Aulaweb', 'Blank', 'Deeds', 'Diagram', 'FSM', 'FSM_Related',
'Other', 'Properties', 'Study', 'TextEditor', 'Study_Materials'],
dtype=object)
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 6, 'actv_grp'].nunique()
11
# Session 2: bar chart of interim_pass counts, then the mean of interim_pass.
sns.catplot(
    data=interim_sqrt_df[interim_sqrt_df['sess'] == 2],
    kind='count',
    x='interim_pass',
)
plt.show()
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 2, 'interim_pass'].mean()
0.41960183767228176
# Session 3.
sns.catplot(
    data=interim_sqrt_df[interim_sqrt_df['sess'] == 3],
    kind='count',
    x='interim_pass',
)
plt.show()
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 3, 'interim_pass'].mean()
0.5833333333333334
# Session 4.
sns.catplot(
    data=interim_sqrt_df[interim_sqrt_df['sess'] == 4],
    kind='count',
    x='interim_pass',
)
plt.show()
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 4, 'interim_pass'].mean()
0.9895561357702349
# Session 5.
sns.catplot(
    data=interim_sqrt_df[interim_sqrt_df['sess'] == 5],
    kind='count',
    x='interim_pass',
)
plt.show()
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 5, 'interim_pass'].mean()
0.957004160887656
# Session 6.
sns.catplot(
    data=interim_sqrt_df[interim_sqrt_df['sess'] == 6],
    kind='count',
    x='interim_pass',
)
plt.show()
interim_sqrt_df.loc[interim_sqrt_df['sess'] == 6, 'interim_pass'].mean()
0.26772616136919314
# Reshape to long format: the identifier/outcome columns are repeated on every
# row, and each feature column becomes a ('variable', 'value') pair.
interim_sqrt_lf = pd.melt(
    interim_sqrt_df,
    id_vars=[
        'sess',
        'sid',
        'actv_grp',
        'interim_scr',
        'max_interim_scr',
        'interim_pass',
    ],
    value_vars=feature_names,
    ignore_index=True,
)
interim_sqrt_lf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 280434 entries, 0 to 280433 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sess 280434 non-null int64 1 sid 280434 non-null int64 2 actv_grp 280434 non-null object 3 interim_scr 280434 non-null float64 4 max_interim_scr 280434 non-null float64 5 interim_pass 280434 non-null float64 6 variable 280434 non-null object 7 value 280434 non-null float64 dtypes: float64(4), int64(2), object(2) memory usage: 17.1+ MB
# Histogram of every variable (one row each) for every session (one column
# each); axes are independent per facet.
sns.displot(
    data=interim_sqrt_lf,
    kind='hist',
    x='value',
    row='variable',
    col='sess',
    bins=15,
    facet_kws=dict(sharex=False, sharey=False),
)
plt.show()
# Kernel density estimate per variable, one curve per session; each curve is
# normalised on its own (common_norm=False) and facets wrap after 3 columns.
sns.displot(
    data=interim_sqrt_lf,
    kind='kde',
    x='value',
    hue='sess',
    col='variable',
    col_wrap=3,
    common_norm=False,
    facet_kws=dict(sharex=False, sharey=False),
)
plt.show()
# Box plot of each variable by session, one facet per variable with
# independent axes.
sns.catplot(
    data=interim_sqrt_lf,
    kind='box',
    x='sess',
    y='value',
    hue='sess',
    col='variable',
    col_wrap=3,
    sharex=False,
    sharey=False,
)
plt.show()
# Point estimate with 95% confidence interval of each variable per session,
# split by interim_pass; markers are dodged and not connected by lines.
# NOTE(review): seaborn 0.13 deprecates `join` in favour of
# linestyle='none'; kept as-is so the call still works on seaborn 0.12.
sns.catplot(
    data=interim_sqrt_lf,
    kind='point',
    x='sess',
    y='value',
    hue='interim_pass',
    col='variable',
    col_wrap=3,
    dodge=True,
    join=False,
    errorbar=('ci', 95),
    sharex=False,
    sharey=False,
)
plt.show()
# Same point/interval view, now with one row per variable and one column per
# activity group.
# NOTE(review): seaborn 0.13 deprecates `join` in favour of
# linestyle='none'; kept as-is so the call still works on seaborn 0.12.
sns.catplot(
    data=interim_sqrt_lf,
    kind='point',
    x='sess',
    y='value',
    hue='interim_pass',
    row='variable',
    col='actv_grp',
    dodge=True,
    join=False,
    errorbar=('ci', 95),
    sharex=False,
    sharey=False,
)
plt.show()
# One scatter grid per variable list, drawn in the same order as before:
# rows are the variables in that list, columns are sessions, and points are
# coloured by activity group. The x axis is independent per facet.
for family_vars in (
    totl_vars,
    mw_vars,
    mwc_vars,
    mcl_vars,
    mcr_vars,
    mm_vars,
    ks_vars,
):
    # Rows of the long-format frame that belong to this variable list.
    family_rows = interim_sqrt_lf['variable'].isin(family_vars)
    sns.relplot(
        data=interim_sqrt_lf.loc[family_rows],
        kind='scatter',
        x='value',
        y='interim_pass',
        hue='actv_grp',
        row='variable',
        col='sess',
        facet_kws=dict(sharex=False),
    )
    plt.show()